Aim

Visualise some aspect of the data you find interesting, e.g., the average number of free throws per period for the regular season and the playoffs.

Set up

Load analysis packages.

library(tidyverse)
#> Loading tidyverse: ggplot2
#> Loading tidyverse: tibble
#> Loading tidyverse: tidyr
#> Loading tidyverse: readr
#> Loading tidyverse: purrr
#> Loading tidyverse: dplyr
#> Conflicts with tidy packages ----------------------------------------------
#> filter(): dplyr, stats
#> lag():    dplyr, stats
library(MangoTest)
library(ggridges)
library(plotly)
#> 
#> Attaching package: 'plotly'
#> The following object is masked from 'package:ggplot2':
#> 
#>     last_plot
#> The following object is masked from 'package:stats':
#> 
#>     filter
#> The following object is masked from 'package:graphics':
#> 
#>     layout

Data

Load in the data and explore structure. Data lists the play by play scores in multiple games accross several seasons.

str(free_throws)
#> Classes 'tbl_df', 'tbl' and 'data.frame':    618019 obs. of  11 variables:
#>  $ end_result: chr  "106 - 114" "106 - 114" "106 - 114" "106 - 114" ...
#>  $ game      : chr  "PHX - LAL" "PHX - LAL" "PHX - LAL" "PHX - LAL" ...
#>  $ game_id   : num  2.61e+08 2.61e+08 2.61e+08 2.61e+08 2.61e+08 ...
#>  $ period    : num  1 1 1 1 1 1 1 2 2 2 ...
#>  $ play      : chr  "Andrew Bynum makes free throw 1 of 2" "Andrew Bynum makes free throw 2 of 2" "Andrew Bynum makes free throw 1 of 2" "Andrew Bynum misses free throw 2 of 2" ...
#>  $ player    : chr  "Andrew Bynum" "Andrew Bynum" "Andrew Bynum" "Andrew Bynum" ...
#>  $ playoffs  : chr  "regular" "regular" "regular" "regular" ...
#>  $ score     : chr  "0 - 1" "0 - 2" "18 - 12" "18 - 12" ...
#>  $ season    : chr  "2006 - 2007" "2006 - 2007" "2006 - 2007" "2006 - 2007" ...
#>  $ shot_made : int  1 1 1 0 1 1 1 0 1 1 ...
#>  $ time      :Classes 'hms', 'difftime'  atomic [1:618019] 42300 42300 26760 26760 26280 ...
#>   .. ..- attr(*, "units")= chr "secs"
#>  - attr(*, "spec")=List of 2
#>   ..$ cols   :List of 11
#>   .. ..$ end_result: list()
#>   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
#>   .. ..$ game      : list()
#>   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
#>   .. ..$ game_id   : list()
#>   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
#>   .. ..$ period    : list()
#>   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
#>   .. ..$ play      : list()
#>   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
#>   .. ..$ player    : list()
#>   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
#>   .. ..$ playoffs  : list()
#>   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
#>   .. ..$ score     : list()
#>   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
#>   .. ..$ season    : list()
#>   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
#>   .. ..$ shot_made : list()
#>   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
#>   .. ..$ time      :List of 1
#>   .. .. ..$ format: chr ""
#>   .. .. ..- attr(*, "class")= chr  "collector_time" "collector"
#>   ..$ default: list()
#>   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
#>   ..- attr(*, "class")= chr "col_spec"

Split out teams, and scores in each game.

tidy_free_throws <- free_throws %>% 
  separate(game, c("team_1", "team_2"), " - ") %>% 
  separate(score, c("score_1", "score_2"), " - ") %>% 
  separate(end_result, c("end_result_1", "end_result_2"), " - ")

Seperate game wide information, play information and scores information. Convert to factors as appropriate.

games_df <- tidy_free_throws %>% 
  select(game_id, team_1, team_2, playoffs,
         season, end_result_1, end_result_2) %>% 
  unique %>% 
  mutate(draw = ifelse(end_result_1 == end_result_2, "Yes", "No")) %>% 
  gather(key = "order", value = "team", team_1, team_2) %>% 
  separate(order, c("tmp", "order"), "_") %>% 
  select(-tmp) %>% 
  gather(key = "order2", value = "end_result", 
         end_result_1, end_result_2) %>% 
  separate(order2, c("tmp", "order2"), "result_") %>% 
  select(-tmp) %>% 
  filter(order == order2) %>% 
  select(-order2) %>% 
  mutate_at(.vars = vars(order, team, season, playoffs),
            .funs = funs(factor(.))) %>% 
  mutate(end_result = as.numeric(end_result)) %>% 
  mutate(season = factor(season, levels = rev(paste0(2006:2015, " - ", 2007:2016))))

plays_df <- tidy_free_throws %>% 
  select(game_id, period, play, player, shot_made, time) %>% 
  mutate(player = factor(player))

scores_df <- tidy_free_throws %>% 
  select(game_id, score_1, score_2, period, time) %>% 
    gather(key = "order", value = "score", score_1, score_2) %>% 
  separate(order, c("tmp", "order"), "_") %>% 
  select(-tmp) %>% 
  mutate(order = factor(order)) %>% 
  mutate(score = as.numeric(score))

Add game outcome variable.

games_df <- games_df %>% 
  group_by(game_id) %>%
  arrange(desc(end_result), .by_group = TRUE) %>% 
  mutate(outcome = c("Won", "Lost")) %>% 
  ungroup() %>% 
  mutate(outcome = ifelse(draw %in% "Yes", "Draw", outcome)) %>% 
  mutate(outcome = factor(outcome)) %>% 
  select(-draw)

Summarise each dataset in turn. No draws in the dataset - online search suggests NBA games are played until a team wins.

summary(games_df)
#>     game_id              playoffs             season     order    
#>  Min.   :261031013   playoffs: 1672   2013 - 2014:2634   1:12874  
#>  1st Qu.:290119029   regular :24076   2008 - 2009:2632   2:12874  
#>  Median :310410028                    2015 - 2016:2626            
#>  Mean   :336085907                    2009 - 2010:2626            
#>  3rd Qu.:400489596                    2010 - 2011:2624            
#>  Max.   :400878160                    2012 - 2013:2622            
#>                                       (Other)    :9984            
#>       team         end_result     outcome     
#>  SA     :  934   Min.   : 54.00   Lost:12874  
#>  MIA    :  918   1st Qu.: 91.00   Won :12874  
#>  BOS    :  910   Median : 99.00               
#>  CLE    :  902   Mean   : 99.55               
#>  LAL    :  901   3rd Qu.:107.00               
#>  ATL    :  886   Max.   :168.00               
#>  (Other):20297
summary(plays_df)
#>     game_id              period          play          
#>  Min.   :261031013   Min.   :1.000   Length:618019     
#>  1st Qu.:281226023   1st Qu.:2.000   Class :character  
#>  Median :310306001   Median :3.000   Mode  :character  
#>  Mean   :333936881   Mean   :2.696                     
#>  3rd Qu.:400489501   3rd Qu.:4.000                     
#>  Max.   :400878160   Max.   :8.000                     
#>                                                        
#>              player         shot_made          time         
#>  LeBron James   :  8001   Min.   :0.0000   Length:618019    
#>  Dwight Howard  :  7728   1st Qu.:1.0000   Class1:hms       
#>  Kevin Durant   :  6030   Median :1.0000   Class2:difftime  
#>  Dwyane Wade    :  5594   Mean   :0.7568   Mode  :numeric   
#>  Kobe Bryant    :  5594   3rd Qu.:1.0000                    
#>  Carmelo Anthony:  5318   Max.   :1.0000                    
#>  (Other)        :579754
summary(scores_df)
#>     game_id              period          time          order     
#>  Min.   :261031013   Min.   :1.000   Length:1236038    1:618019  
#>  1st Qu.:281226023   1st Qu.:2.000   Class1:hms        2:618019  
#>  Median :310306001   Median :3.000   Class2:difftime             
#>  Mean   :333936881   Mean   :2.696   Mode  :numeric              
#>  3rd Qu.:400489501   3rd Qu.:4.000                               
#>  Max.   :400878160   Max.   :8.000                               
#>      score       
#>  Min.   :  0.00  
#>  1st Qu.: 32.00  
#>  Median : 57.00  
#>  Mean   : 56.47  
#>  3rd Qu.: 81.00  
#>  Max.   :166.00

Visualisations

Comparision of Final Scores over Multiple Seasons

games_df %>%
  ggplot(aes(x = end_result, y = season, fill = outcome)) +
  geom_density_ridges(alpha = 0.6) +
  theme_minimal() +
  scale_fill_viridis_d() +
  guides(fill = guide_legend(title = "Game Outcome")) +
  labs(x = "Final Score",
       y = "Season",
       title = "Distribution of NBA Final Scores",
       subtitle = "By Game Outcome from 2006 until 2016",
       caption = "By Sam Abbott, for Mango Solutions. Source: NBA Free Throws")
#> Picking joint bandwidth of 2.26

Times at Which Points were Scored During Play

scores_df %>% 
  left_join(games_df %>% 
              select(game_id, order, team, outcome, season),
            by = c("game_id", "order")) %>%  
  mutate(period = factor(period)) %>% 
  group_by(team, period, game_id) %>% 
  arrange(desc(score), .by_group = TRUE) %>% 
  slice(1) %>%
  ungroup %>% 
  ggplot(aes(x = score, y = period,
             fill = outcome)) +
  geom_density_ridges(alpha = 0.6) +
  scale_fill_viridis_d() +
  theme_minimal() +
  guides(fill = guide_legend(title = "Game Outcome")) +
  labs(x = "Score",
       y = "Period",
       title = "Distribution of NBA Scores during Play",
       subtitle = "By Game Outcome for all Teams: 2006 - 2016",
       caption = "By Sam Abbott, for Mango Solutions. Source: NBA Free Throws")
#> Picking joint bandwidth of 2.27

Conversion Rates of Free Throws by Player

conv_player_df <- plays_df %>% 
  left_join(games_df %>% 
              select(game_id, season) %>% 
              unique,
            by = "game_id") %>% 
  group_by(player, season) %>% 
  summarise(conversion = mean(shot_made), shots = n()) %>% 
  mutate(conversion = round(conversion, digits = 3)) %>% 
  ungroup

conv_player_df %>% 
  plot_ly(y = ~conversion*100, x = ~shots, 
          text = ~player,
          frame = ~season) %>%
  add_markers() %>% 
  layout(yaxis = list(title = "Conversion Rate", range = c(0, 110))) %>% 
  layout(yaxis = list(ticksuffix = "%")) %>% 
  layout(xaxis = list(title = "Free Throws")) %>% 
  layout(title = "NBA Free Throw Conversion Rate") %>% 
  animation_opts(2000, redraw = FALSE, easing = "elastic") %>% 
  animation_slider(currentvalue = list(prefix = "Season: ")) %>% 
  hide_legend()